library(tidyverse)
library(ggplot2)
library(lavaan)
library(car)
library(glmnet)
library(randomForestSRC)
library(caret)AAQoL machine learning analysis with unbalanced random forest
Data set
This data set is from the 2015 Asian American Quality of Life survey. Participants are from Austin, Texas.
Input data set
qol <- read_csv("AAQoL.csv") |> mutate(across(where(is.character), ~as.factor(.x))) |>
mutate(`English Difficulties`=relevel(`English Difficulties`,ref="Not at all"),
`English Speaking`=relevel(`English Speaking`,ref="Not at all"),
Ethnicity = relevel(Ethnicity,ref="Chinese")) |>
mutate(Income_median = case_match(Income,"$0 - $9,999"~"Below",
"$10,000 - $19,999" ~"Below",
"$20,000 - $29,999"~"Below",
"$30,000 - $39,999"~"Below",
"$40,000 - $49,999"~"Below",
"$50,000 - $59,999"~"Below",
"$60,000 - $69,999"~"Above",
"$70,000 and over"~"Above",
.default=Income)) |>
mutate(Income_median = factor(Income_median, levels=c("Below","Above")))New names:
Rows: 2609 Columns: 231
── Column specification
──────────────────────────────────────────────────────── Delimiter: "," chr
(190): Gender, Ethnicity, Marital Status, No One, Spouse, Children, Gran... dbl
(41): Survey ID, Age, Education Completed, Household Size, Grandparent,...
ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
Specify the column types or set `show_col_types = FALSE` to quiet this message.
• `Other` -> `Other...17`
• `Other` -> `Other...89`
qol |> DT::datatable()Warning in instance$preRenderHook(instance): It seems your data is too big for
client-side DataTables. You may consider server-side processing:
https://rstudio.github.io/DT/server.html
Source of Information: Family
ps(Family)# A tibble: 4 × 3
Family n pct
<fct> <int> <dbl>
1 3 1 0.0383
2 No 1258 48.2
3 Yes 1331 51.0
4 <NA> 19 0.728
rfdata <- qol |> filter(Family %in% c("No","Yes")) |>
mutate(Family=droplevels(Family)) |>
select(Family, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
# filter(!is.na(Family)) |>
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame()
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Family ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 2187
Frequency of class labels: 1069, 1118
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 481.777
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1382
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0458
(OOB) Brier score: 0.23052563
(OOB) Normalized Brier score: 0.92210253
(OOB) AUC: 0.65475128
(OOB) Log-loss: 0.6518979
(OOB) PR-AUC: 0.61704468
(OOB) G-mean: 0.60961421
(OOB) Requested performance error: 0.39038579
Confusion matrix:
predicted
observed No Yes class.error
No 705 364 0.3405
Yes 488 630 0.4365
(OOB) Misclassification rate: 0.3895748
print(rfobj) Sample size: 2187
Frequency of class labels: 1069, 1118
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 481.777
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1382
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0458
(OOB) Brier score: 0.23052563
(OOB) Normalized Brier score: 0.92210253
(OOB) AUC: 0.65475128
(OOB) Log-loss: 0.6518979
(OOB) PR-AUC: 0.61704468
(OOB) G-mean: 0.60961421
(OOB) Requested performance error: 0.39038579
Confusion matrix:
predicted
observed No Yes class.error
No 705 364 0.3405
Yes 488 630 0.4365
(OOB) Misclassification rate: 0.3895748
plot(rfobj,plots.one.page = FALSE)

all No Yes
Age 0.0316 NA NA
Ethnicity 0.0278 NA NA
EnglishSpeak 0.0130 NA NA
See Family 0.0094 NA NA
Similar Values 0.0080 NA NA
Family Respect 0.0060 NA NA
Community Trust 0.0044 NA NA
EnglishDiff 0.0042 NA NA
Gender 0.0040 NA NA
Get Along 0.0035 NA NA
Helpful Community 0.0033 NA NA
Religion 0.0032 NA NA
Family Pride 0.0031 NA NA
See Friends 0.0028 NA NA
Spend Time Together 0.0024 NA NA
Close Friends 0.0023 NA NA
Trust 0.0023 NA NA
Religious Importance 0.0020 NA NA
Income_median 0.0019 NA NA
Close-knit Community 0.0017 NA NA
Community Shares Values 0.0015 NA NA
Loyalty 0.0015 NA NA
Feel Close 0.0008 NA NA
Expression 0.0003 NA NA
Religious Attendance -0.0006 NA NA
Togetherness -0.0009 NA NA
rfobj$importance all No Yes
Ethnicity 0.0277891088 NA NA
Age 0.0316126279 NA NA
Gender 0.0040003578 NA NA
Religion 0.0031809281 NA NA
Employment -0.0019746339 NA NA
Income_median 0.0019383649 NA NA
EnglishSpeak 0.0130168401 NA NA
EnglishDiff 0.0042021061 NA NA
See Family 0.0093535682 NA NA
Close Family -0.0012422567 NA NA
Helpful Family -0.0046550437 NA NA
See Friends 0.0028436753 NA NA
Close Friends 0.0023006026 NA NA
Helpful Friends -0.0075217884 NA NA
Family Respect 0.0060014684 NA NA
Similar Values 0.0080321841 NA NA
Successful Family -0.0055411576 NA NA
Trust 0.0022647824 NA NA
Loyalty 0.0014531945 NA NA
Family Pride 0.0031305696 NA NA
Expression 0.0002745701 NA NA
Spend Time Together 0.0024204779 NA NA
Feel Close 0.0008158354 NA NA
Togetherness -0.0009161691 NA NA
Religious Attendance -0.0006420113 NA NA
Religious Importance 0.0019934441 NA NA
Close-knit Community 0.0016836881 NA NA
Helpful Community 0.0032857974 NA NA
Community Shares Values 0.0015075418 NA NA
Get Along 0.0035156076 NA NA
Community Trust 0.0044315729 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
pos<- rfdata |> filter(Family=="Yes")
neg <- rfdata |> filter(Family=="No")
set.seed(222)
imbal_index <- caret::createDataPartition(rfdata$Family,p=0.75,list=F,times=1)
train <- rfdata[imbal_index,]
test<- rfdata[-imbal_index,]
rfsrc(Family~.,data=train, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
# rfobj <- imbalanced(Family ~ .,importance=T,data=train,
# perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1641
Frequency of class labels: 802, 839
Number of trees: 500
Forest terminal node size: 1
Average no. of terminal nodes: 361.26
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1037
Analysis: RF-C
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0461
(OOB) Brier score: 0.23541641
(OOB) Normalized Brier score: 0.94166563
(OOB) AUC: 0.63563863
(OOB) Log-loss: 0.66299671
(OOB) PR-AUC: 0.59083227
(OOB) G-mean: 0.59246736
(OOB) Requested performance error: 0.40819021
Confusion matrix:
predicted
observed No Yes class.error
No 481 321 0.4002
Yes 350 489 0.4172
(OOB) Misclassification rate: 0.408897
print(rfobj) Sample size: 1641
Frequency of class labels: 802, 839
Number of trees: 500
Forest terminal node size: 1
Average no. of terminal nodes: 361.26
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1037
Analysis: RF-C
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0461
(OOB) Brier score: 0.23541641
(OOB) Normalized Brier score: 0.94166563
(OOB) AUC: 0.63563863
(OOB) Log-loss: 0.66299671
(OOB) PR-AUC: 0.59083227
(OOB) G-mean: 0.59246736
(OOB) Requested performance error: 0.40819021
Confusion matrix:
predicted
observed No Yes class.error
No 481 321 0.4002
Yes 350 489 0.4172
(OOB) Misclassification rate: 0.408897
plot(rfobj,plots.one.page = FALSE)

all No Yes
Age 0.0256 NA NA
Ethnicity 0.0124 NA NA
See Friends 0.0071 NA NA
Helpful Family 0.0056 NA NA
Close Family 0.0049 NA NA
Get Along 0.0037 NA NA
Spend Time Together 0.0034 NA NA
EnglishDiff 0.0027 NA NA
Close-knit Community 0.0027 NA NA
EnglishSpeak 0.0024 NA NA
Family Respect 0.0020 NA NA
Feel Close 0.0019 NA NA
Similar Values 0.0019 NA NA
Trust 0.0019 NA NA
Togetherness 0.0016 NA NA
Helpful Community 0.0015 NA NA
Community Shares Values 0.0015 NA NA
Expression 0.0013 NA NA
Loyalty 0.0011 NA NA
Close Friends 0.0011 NA NA
Helpful Friends 0.0009 NA NA
Gender 0.0009 NA NA
Religious Importance 0.0007 NA NA
Community Trust 0.0007 NA NA
Income_median 0.0006 NA NA
Employment 0.0005 NA NA
rfobj$importance all No Yes
Ethnicity 0.0124488512 NA NA
Age 0.0256417238 NA NA
Gender 0.0008643104 NA NA
Religion 0.0003919468 NA NA
Employment 0.0005279885 NA NA
Income_median 0.0005553531 NA NA
EnglishSpeak 0.0023720548 NA NA
EnglishDiff 0.0027370131 NA NA
See Family -0.0003238129 NA NA
Close Family 0.0049270472 NA NA
Helpful Family 0.0055638915 NA NA
See Friends 0.0070965402 NA NA
Close Friends 0.0010713659 NA NA
Helpful Friends 0.0009139973 NA NA
Family Respect 0.0020377689 NA NA
Similar Values 0.0018846736 NA NA
Successful Family -0.0007818162 NA NA
Trust 0.0018536063 NA NA
Loyalty 0.0010989735 NA NA
Family Pride 0.0004530480 NA NA
Expression 0.0013203062 NA NA
Spend Time Together 0.0034320537 NA NA
Feel Close 0.0019263946 NA NA
Togetherness 0.0016238509 NA NA
Religious Attendance -0.0005669903 NA NA
Religious Importance 0.0007134177 NA NA
Close-knit Community 0.0026958162 NA NA
Helpful Community 0.0015000579 NA NA
Community Shares Values 0.0014710505 NA NA
Get Along 0.0037314878 NA NA
Community Trust 0.0006983894 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
279.0000000 267.0000000 1.0449438 0.4890110 0.6441948 0.5555556
prec npv misclass brier brier.norm auc
0.5810811 0.6200000 0.4010989 0.2342201 0.9368802 0.6376102
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.6595606 0.6110124 0.5982508 0.4890110 0.6059950 0.6046241
F1modgmean gmean
0.5982433 0.5982357
test_rf$importance all No Yes
Ethnicity 0.0155882728 NA NA
Age 0.0335115708 NA NA
Gender 0.0048163780 NA NA
Religion 0.0051198457 NA NA
Employment 0.0004498480 NA NA
Income_median 0.0033055926 NA NA
EnglishSpeak 0.0021304424 NA NA
EnglishDiff 0.0115481464 NA NA
See Family 0.0031161807 NA NA
Close Family 0.0107349956 NA NA
Helpful Family 0.0101640966 NA NA
See Friends 0.0045042555 NA NA
Close Friends 0.0057056810 NA NA
Helpful Friends 0.0062768016 NA NA
Family Respect 0.0045915047 NA NA
Similar Values 0.0032233405 NA NA
Successful Family 0.0023504195 NA NA
Trust 0.0019277831 NA NA
Loyalty 0.0035133417 NA NA
Family Pride -0.0003255301 NA NA
Expression 0.0013490105 NA NA
Spend Time Together 0.0006522597 NA NA
Feel Close 0.0024226706 NA NA
Togetherness 0.0024323453 NA NA
Religious Attendance 0.0011744121 NA NA
Religious Importance -0.0014256837 NA NA
Close-knit Community -0.0011670545 NA NA
Helpful Community 0.0061264047 NA NA
Community Shares Values 0.0045657712 NA NA
Get Along 0.0001436143 NA NA
Community Trust 0.0027542658 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Source of Information: Health Professionals
ps(`Heal Professionals`)# A tibble: 3 × 3
`Heal Professionals` n pct
<fct> <int> <dbl>
1 No 1326 50.8
2 Yes 1264 48.4
3 <NA> 19 0.728
rfdata <- qol |>
select(`Heal Professionals`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame()
imbalanced(`Heal Professionals` ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")->rfobj
print(rfobj) Sample size: 2188
Frequency of class labels: 1067, 1121
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 482.5363
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1383
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0506
(OOB) Brier score: 0.23144405
(OOB) Normalized Brier score: 0.92577618
(OOB) AUC: 0.65773693
(OOB) Log-loss: 0.65531206
(OOB) PR-AUC: 0.62771089
(OOB) G-mean: 0.61988973
(OOB) Requested performance error: 0.38011027
Confusion matrix:
predicted
observed No Yes class.error
No 670 397 0.3721
Yes 435 686 0.3880
(OOB) Misclassification rate: 0.3802559
plot(rfobj,plots.one.page = FALSE)

all No Yes
EnglishSpeak 0.0289 NA NA
Close Friends 0.0137 NA NA
Income_median 0.0123 NA NA
See Friends 0.0091 NA NA
Age 0.0077 NA NA
Community Trust 0.0064 NA NA
Religious Attendance 0.0064 NA NA
See Family 0.0059 NA NA
Close Family 0.0055 NA NA
Employment 0.0046 NA NA
Community Shares Values 0.0041 NA NA
Expression 0.0037 NA NA
Helpful Family 0.0037 NA NA
Helpful Community 0.0032 NA NA
Close-knit Community 0.0032 NA NA
EnglishDiff 0.0028 NA NA
Trust 0.0024 NA NA
Family Pride 0.0024 NA NA
Religious Importance 0.0023 NA NA
Family Respect 0.0018 NA NA
Similar Values 0.0014 NA NA
Gender 0.0009 NA NA
Togetherness 0.0006 NA NA
Feel Close 0.0006 NA NA
Successful Family 0.0001 NA NA
Religion 0.0000 NA NA
rfobj$importance all No Yes
Ethnicity -1.677537e-03 NA NA
Age 7.733131e-03 NA NA
Gender 8.772761e-04 NA NA
Religion 3.843916e-05 NA NA
Employment 4.561907e-03 NA NA
Income_median 1.230494e-02 NA NA
EnglishSpeak 2.885256e-02 NA NA
EnglishDiff 2.767643e-03 NA NA
See Family 5.928918e-03 NA NA
Close Family 5.511011e-03 NA NA
Helpful Family 3.711948e-03 NA NA
See Friends 9.133932e-03 NA NA
Close Friends 1.367838e-02 NA NA
Helpful Friends -4.988185e-04 NA NA
Family Respect 1.791640e-03 NA NA
Similar Values 1.402887e-03 NA NA
Successful Family 7.081082e-05 NA NA
Trust 2.405354e-03 NA NA
Loyalty -4.967590e-03 NA NA
Family Pride 2.385045e-03 NA NA
Expression 3.728908e-03 NA NA
Spend Time Together -4.533935e-03 NA NA
Feel Close 5.761629e-04 NA NA
Togetherness 6.207130e-04 NA NA
Religious Attendance 6.357330e-03 NA NA
Religious Importance 2.263205e-03 NA NA
Close-knit Community 3.177590e-03 NA NA
Helpful Community 3.185724e-03 NA NA
Community Shares Values 4.146285e-03 NA NA
Get Along -2.654628e-03 NA NA
Community Trust 6.374364e-03 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
pos<- rfdata |> filter(`Heal Professionals`=="Yes")
neg <- rfdata |> filter(`Heal Professionals`==0)
set.seed(222)
imbal_index <- createDataPartition(rfdata$`Heal Professionals`,p=0.75,list=F,times=1)
train <- rfdata[imbal_index,]
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(`Heal Professionals` ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1642
Frequency of class labels: 801, 841
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 362.0227
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1038
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0499
(OOB) Brier score: 0.23293394
(OOB) Normalized Brier score: 0.93173574
(OOB) AUC: 0.65048668
(OOB) Log-loss: 0.65855641
(OOB) PR-AUC: 0.62608067
(OOB) G-mean: 0.60369638
(OOB) Requested performance error: 0.39630362
Confusion matrix:
predicted
observed No Yes class.error
No 492 309 0.3858
Yes 342 499 0.4067
(OOB) Misclassification rate: 0.3964677
print(rfobj) Sample size: 1642
Frequency of class labels: 801, 841
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 362.0227
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1038
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0499
(OOB) Brier score: 0.23293394
(OOB) Normalized Brier score: 0.93173574
(OOB) AUC: 0.65048668
(OOB) Log-loss: 0.65855641
(OOB) PR-AUC: 0.62608067
(OOB) G-mean: 0.60369638
(OOB) Requested performance error: 0.39630362
Confusion matrix:
predicted
observed No Yes class.error
No 492 309 0.3858
Yes 342 499 0.4067
(OOB) Misclassification rate: 0.3964677
plot(rfobj,plots.one.page = FALSE)

all No Yes
See Friends 0.0104 NA NA
Income_median 0.0094 NA NA
EnglishSpeak 0.0092 NA NA
Religious Attendance 0.0067 NA NA
Community Shares Values 0.0039 NA NA
Helpful Community 0.0025 NA NA
Close Friends 0.0024 NA NA
Age 0.0018 NA NA
Community Trust 0.0018 NA NA
Religion 0.0018 NA NA
Close-knit Community 0.0000 NA NA
Family Respect -0.0006 NA NA
Gender -0.0006 NA NA
Trust -0.0012 NA NA
Get Along -0.0023 NA NA
Feel Close -0.0030 NA NA
Close Family -0.0030 NA NA
EnglishDiff -0.0035 NA NA
Helpful Friends -0.0036 NA NA
Family Pride -0.0042 NA NA
Spend Time Together -0.0042 NA NA
Togetherness -0.0048 NA NA
Helpful Family -0.0049 NA NA
Successful Family -0.0054 NA NA
Expression -0.0055 NA NA
Ethnicity -0.0066 NA NA
rfobj$importance all No Yes
Ethnicity -6.617697e-03 NA NA
Age 1.823620e-03 NA NA
Gender -6.193423e-04 NA NA
Religion 1.817454e-03 NA NA
Employment -7.934585e-03 NA NA
Income_median 9.442515e-03 NA NA
EnglishSpeak 9.223977e-03 NA NA
EnglishDiff -3.508588e-03 NA NA
See Family -1.524012e-02 NA NA
Close Family -3.016992e-03 NA NA
Helpful Family -4.851720e-03 NA NA
See Friends 1.036126e-02 NA NA
Close Friends 2.422024e-03 NA NA
Helpful Friends -3.645479e-03 NA NA
Family Respect -6.193423e-04 NA NA
Similar Values -7.929731e-03 NA NA
Successful Family -5.419827e-03 NA NA
Trust -1.225781e-03 NA NA
Loyalty -7.875119e-03 NA NA
Family Pride -4.235469e-03 NA NA
Expression -5.496590e-03 NA NA
Spend Time Together -4.248899e-03 NA NA
Feel Close -2.982737e-03 NA NA
Togetherness -4.754138e-03 NA NA
Religious Attendance 6.730834e-03 NA NA
Religious Importance -1.036653e-02 NA NA
Close-knit Community 2.213112e-05 NA NA
Helpful Community 2.533134e-03 NA NA
Community Shares Values 3.900228e-03 NA NA
Get Along -2.316820e-03 NA NA
Community Trust 1.817454e-03 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
280.0000000 266.0000000 1.0526316 0.4871795 0.6654135 0.6214286
prec npv misclass brier brier.norm auc
0.6254417 0.6615970 0.3571429 0.2282976 0.9131906 0.6750101
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.6490824 0.6448087 0.6428404 0.4871795 0.6409420 0.6439269
F1modgmean gmean
0.6429427 0.6430451
test_rf$importance all No Yes
Ethnicity 0.0091830578 NA NA
Age 0.0033678960 NA NA
Gender -0.0001164933 NA NA
Religion 0.0058558745 NA NA
Employment -0.0007179757 NA NA
Income_median 0.0106624244 NA NA
EnglishSpeak 0.0283652914 NA NA
EnglishDiff 0.0090379447 NA NA
See Family 0.0034879006 NA NA
Close Family 0.0003110750 NA NA
Helpful Family 0.0008932684 NA NA
See Friends 0.0060705775 NA NA
Close Friends 0.0054451949 NA NA
Helpful Friends 0.0030227785 NA NA
Family Respect -0.0002876884 NA NA
Similar Values 0.0013104850 NA NA
Successful Family 0.0022066150 NA NA
Trust 0.0015505719 NA NA
Loyalty 0.0018317141 NA NA
Family Pride 0.0016332540 NA NA
Expression 0.0021782431 NA NA
Spend Time Together 0.0000550326 NA NA
Feel Close 0.0005144554 NA NA
Togetherness -0.0004880158 NA NA
Religious Attendance 0.0024238396 NA NA
Religious Importance 0.0003718744 NA NA
Close-knit Community 0.0007995488 NA NA
Helpful Community 0.0016636832 NA NA
Community Shares Values 0.0011167880 NA NA
Get Along 0.0002869137 NA NA
Community Trust 0.0027582174 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Health Insurance
ps(`Health Insurance`)# A tibble: 3 × 3
`Health Insurance` n pct
<fct> <int> <dbl>
1 0 381 14.6
2 Yes 2207 84.6
3 <NA> 21 0.805
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Health Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Health.Insurance ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(imb) Sample size: 2189
Frequency of class labels: 292, 1897
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 258.8407
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1383
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 6.4966
(OOB) Brier score: 0.10382762
(OOB) Normalized Brier score: 0.41531048
(OOB) AUC: 0.74522678
(OOB) Log-loss: 0.34769219
(OOB) PR-AUC: 0.33319584
(OOB) G-mean: 0.66543552
(OOB) Requested performance error: 0.33456448
Confusion matrix:
predicted
observed 0 Yes class.error
0 224 68 0.2329
Yes 802 1095 0.4228
(OOB) Misclassification rate: 0.3974418
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1897.0000000 292.0000000 6.4965753 0.1333942 0.7671233 0.5772272
prec npv misclass brier brier.norm auc
0.2183236 0.9415305 0.3974418 0.1038276 0.4153105 0.7452268
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.3476922 0.3399090 0.4609118 0.1333942 0.3331958 0.5026722
F1modgmean gmean
0.5631737 0.6654355
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
# ind_pos <- sample(c(0,1), nrow(pos), replace = T, prob = c(0.7, 0.3))
# ind_neg <- sample(c(0,1), nrow(neg), replace = T, prob = c(0.7, 0.3))
#
#
# train <- bind_rows(pos[ind_pos==0,],neg[ind_neg==0,])
# test <- bind_rows(pos[ind_pos==1,],neg[ind_neg==1,])
imbal_index <- createDataPartition(rfdata$Health.Insurance,p=0.75,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Health.Insurance~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Health.Insurance ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1642
Frequency of class labels: 809, 833
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 212.829
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1038
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0297
(OOB) Brier score: 0.07131577
(OOB) Normalized Brier score: 0.28526307
(OOB) AUC: 0.98914077
(OOB) Log-loss: 0.27271079
(OOB) PR-AUC: 0.9888072
(OOB) G-mean: 0.94745674
(OOB) Requested performance error: 0.05254326
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 745 64 0.0791
0 21 812 0.0252
(OOB) Misclassification rate: 0.05176614
print(rfobj) Sample size: 1642
Frequency of class labels: 809, 833
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 212.829
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1038
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0297
(OOB) Brier score: 0.07131577
(OOB) Normalized Brier score: 0.28526307
(OOB) AUC: 0.98914077
(OOB) Log-loss: 0.27271079
(OOB) PR-AUC: 0.9888072
(OOB) G-mean: 0.94745674
(OOB) Requested performance error: 0.05254326
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 745 64 0.0791
0 21 812 0.0252
(OOB) Misclassification rate: 0.05176614
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Ethnicity 0.0251 NA NA
Religion 0.0174 NA NA
EnglishSpeak 0.0164 NA NA
Income_median 0.0146 NA NA
Employment 0.0135 NA NA
Religious.Attendance 0.0134 NA NA
EnglishDiff 0.0108 NA NA
Helpful.Community 0.0099 NA NA
Religious.Importance 0.0090 NA NA
Close.knit.Community 0.0077 NA NA
Community.Trust 0.0076 NA NA
Helpful.Family 0.0072 NA NA
Get.Along 0.0070 NA NA
See.Friends 0.0066 NA NA
Age 0.0056 NA NA
Community.Shares.Values 0.0054 NA NA
Gender 0.0054 NA NA
Similar.Values 0.0049 NA NA
Helpful.Friends 0.0049 NA NA
Feel.Close 0.0041 NA NA
Expression 0.0041 NA NA
Close.Friends 0.0031 NA NA
Family.Respect 0.0030 NA NA
Togetherness 0.0030 NA NA
Close.Family 0.0025 NA NA
Successful.Family 0.0024 NA NA
rfobj$importance all Yes 0
Ethnicity 0.0250746224 NA NA
Age 0.0055677537 NA NA
Gender 0.0053598163 NA NA
Religion 0.0173662061 NA NA
Employment 0.0135148040 NA NA
Income_median 0.0145521135 NA NA
EnglishSpeak 0.0164197923 NA NA
EnglishDiff 0.0108215914 NA NA
See.Family 0.0004832958 NA NA
Close.Family 0.0024919682 NA NA
Helpful.Family 0.0072131811 NA NA
See.Friends 0.0066286342 NA NA
Close.Friends 0.0030747402 NA NA
Helpful.Friends 0.0048771687 NA NA
Family.Respect 0.0030221032 NA NA
Similar.Values 0.0048771687 NA NA
Successful.Family 0.0024385785 NA NA
Trust 0.0012192878 NA NA
Loyalty 0.0011675380 NA NA
Family.Pride 0.0011675380 NA NA
Expression 0.0041406824 NA NA
Spend.Time.Together -0.0001017975 NA NA
Feel.Close 0.0041406824 NA NA
Togetherness 0.0029710402 NA NA
Religious.Attendance 0.0133599027 NA NA
Religious.Importance 0.0089690063 NA NA
Close.knit.Community 0.0077033446 NA NA
Helpful.Community 0.0099316404 NA NA
Community.Shares.Values 0.0054094335 NA NA
Get.Along 0.0069843678 NA NA
Community.Trust 0.0076149226 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
474.0000000 73.0000000 6.4931507 0.1334552 0.9589041 0.1962025
prec npv misclass brier brier.norm auc
0.1552106 0.9687500 0.7020110 0.1512833 0.6051333 0.6777065
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.4682492 0.2671756 0.2937991 0.1334552 0.2236314 0.3504630
F1modgmean gmean
0.3637747 0.4337504
test_rf$importance all Yes 0
Ethnicity -1.043120e-02 NA NA
Age 8.892604e-03 NA NA
Gender 3.965169e-03 NA NA
Religion 7.176509e-03 NA NA
Employment 5.141174e-02 NA NA
Income_median 9.433785e-02 NA NA
EnglishSpeak 4.047334e-02 NA NA
EnglishDiff 2.793042e-03 NA NA
See.Family -4.845609e-03 NA NA
Close.Family -4.077925e-03 NA NA
Helpful.Family -7.243761e-03 NA NA
See.Friends -6.401686e-03 NA NA
Close.Friends -1.795732e-03 NA NA
Helpful.Friends -3.495917e-03 NA NA
Family.Respect -1.679729e-03 NA NA
Similar.Values -1.288285e-04 NA NA
Successful.Family 8.413456e-05 NA NA
Trust -3.280590e-03 NA NA
Loyalty 1.298347e-03 NA NA
Family.Pride 3.210424e-03 NA NA
Expression 3.092767e-03 NA NA
Spend.Time.Together -3.346290e-03 NA NA
Feel.Close 2.436198e-03 NA NA
Togetherness 1.673344e-03 NA NA
Religious.Attendance 7.593028e-03 NA NA
Religious.Importance 2.386402e-03 NA NA
Close.knit.Community 1.967129e-02 NA NA
Helpful.Community 3.244624e-02 NA NA
Community.Shares.Values 1.121431e-02 NA NA
Get.Along 1.284339e-02 NA NA
Community.Trust 1.648031e-02 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Dental Insurance
ps(`Dental Insurance`)# A tibble: 3 × 3
`Dental Insurance` n pct
<fct> <int> <dbl>
1 0 1050 40.2
2 Yes 1529 58.6
3 <NA> 30 1.15
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Dental Insurance`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Dental.Insurance ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(imb) Sample size: 2184
Frequency of class labels: 849, 1335
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 397.6117
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1380
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.5724
(OOB) Brier score: 0.1774567
(OOB) Normalized Brier score: 0.70982678
(OOB) AUC: 0.79679575
(OOB) Log-loss: 0.53238987
(OOB) PR-AUC: 0.7016539
(OOB) G-mean: 0.73581395
(OOB) Requested performance error: 0.26418605
Confusion matrix:
predicted
observed 0 Yes class.error
0 648 201 0.2367
Yes 388 947 0.2906
(OOB) Misclassification rate: 0.2696886
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1335.0000000 849.0000000 1.5724382 0.3887363 0.7632509 0.7093633
prec npv misclass brier brier.norm auc
0.6254826 0.8249129 0.2696886 0.1774567 0.7098268 0.7967957
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.5323899 0.6875332 0.7232077 0.3887363 0.7016539 0.7116736
F1modgmean gmean
0.7295108 0.7358139
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Dental.Insurance,p=0.75,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dental.Insurance~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dental.Insurance ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1639
Frequency of class labels: 807, 832
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 237.8013
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1036
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.031
(OOB) Brier score: 0.10888745
(OOB) Normalized Brier score: 0.43554981
(OOB) AUC: 0.94193758
(OOB) Log-loss: 0.36875306
(OOB) PR-AUC: 0.93722367
(OOB) G-mean: 0.87409144
(OOB) Requested performance error: 0.12590856
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 697 110 0.1363
0 96 736 0.1154
(OOB) Misclassification rate: 0.1256864
print(rfobj) Sample size: 1639
Frequency of class labels: 807, 832
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 237.8013
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1036
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.031
(OOB) Brier score: 0.10888745
(OOB) Normalized Brier score: 0.43554981
(OOB) AUC: 0.94193758
(OOB) Log-loss: 0.36875306
(OOB) PR-AUC: 0.93722367
(OOB) G-mean: 0.87409144
(OOB) Requested performance error: 0.12590856
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 697 110 0.1363
0 96 736 0.1154
(OOB) Misclassification rate: 0.1256864
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Income_median 0.0391 NA NA
EnglishSpeak 0.0217 NA NA
Employment 0.0186 NA NA
Religion 0.0177 NA NA
EnglishDiff 0.0157 NA NA
Ethnicity 0.0145 NA NA
Religious.Importance 0.0108 NA NA
Religious.Attendance 0.0096 NA NA
Get.Along 0.0060 NA NA
Community.Trust 0.0049 NA NA
Helpful.Family 0.0048 NA NA
Close.Friends 0.0042 NA NA
Age 0.0035 NA NA
Community.Shares.Values 0.0030 NA NA
Spend.Time.Together 0.0029 NA NA
Gender 0.0025 NA NA
Close.knit.Community 0.0024 NA NA
Family.Respect 0.0024 NA NA
Togetherness 0.0023 NA NA
Family.Pride 0.0023 NA NA
Loyalty 0.0023 NA NA
Expression 0.0018 NA NA
Trust 0.0018 NA NA
Close.Family 0.0012 NA NA
Feel.Close 0.0012 NA NA
Successful.Family 0.0012 NA NA
rfobj$importance all Yes 0
Ethnicity 1.450301e-02 NA NA
Age 3.493177e-03 NA NA
Gender 2.475878e-03 NA NA
Religion 1.768573e-02 NA NA
Employment 1.862361e-02 NA NA
Income_median 3.914102e-02 NA NA
EnglishSpeak 2.173959e-02 NA NA
EnglishDiff 1.569938e-02 NA NA
See.Family 1.157721e-03 NA NA
Close.Family 1.220851e-03 NA NA
Helpful.Family 4.763477e-03 NA NA
See.Friends 4.780771e-04 NA NA
Close.Friends 4.224828e-03 NA NA
Helpful.Friends -6.304234e-05 NA NA
Family.Respect 2.378485e-03 NA NA
Similar.Values -9.200622e-05 NA NA
Successful.Family 1.157721e-03 NA NA
Trust 1.814843e-03 NA NA
Loyalty 2.322104e-03 NA NA
Family.Pride 2.349440e-03 NA NA
Expression 1.814843e-03 NA NA
Spend.Time.Together 2.945910e-03 NA NA
Feel.Close 1.157721e-03 NA NA
Togetherness 2.349440e-03 NA NA
Religious.Attendance 9.573872e-03 NA NA
Religious.Importance 1.081689e-02 NA NA
Close.knit.Community 2.409239e-03 NA NA
Helpful.Community 4.780771e-04 NA NA
Community.Shares.Values 2.974120e-03 NA NA
Get.Along 5.984164e-03 NA NA
Community.Trust 4.883412e-03 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
333.0000000 212.0000000 1.5707547 0.3889908 0.8113208 0.5705706
prec npv misclass brier brier.norm auc
0.5460317 0.8260870 0.3357798 0.1882569 0.7530276 0.7770377
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.5578571 0.6527514 0.6636678 0.3889908 0.6956436 0.6665653
F1modgmean gmean
0.6720235 0.6803791
test_rf$importance all Yes 0
Ethnicity 7.803448e-03 NA NA
Age 8.560578e-03 NA NA
Gender -9.026692e-04 NA NA
Religion 3.745040e-03 NA NA
Employment 1.977079e-02 NA NA
Income_median 7.812789e-02 NA NA
EnglishSpeak 2.458840e-02 NA NA
EnglishDiff 4.502704e-03 NA NA
See.Family 4.685076e-05 NA NA
Close.Family 6.759530e-05 NA NA
Helpful.Family -5.780287e-06 NA NA
See.Friends -3.527108e-03 NA NA
Close.Friends -2.801842e-04 NA NA
Helpful.Friends 1.002257e-03 NA NA
Family.Respect -6.529591e-04 NA NA
Similar.Values 2.538353e-05 NA NA
Successful.Family 1.098977e-03 NA NA
Trust 9.871958e-05 NA NA
Loyalty 1.004083e-03 NA NA
Family.Pride -8.173244e-04 NA NA
Expression 1.551758e-03 NA NA
Spend.Time.Together -8.461073e-05 NA NA
Feel.Close 6.082376e-04 NA NA
Togetherness -7.650017e-04 NA NA
Religious.Attendance 1.515353e-03 NA NA
Religious.Importance -3.489919e-04 NA NA
Close.knit.Community 5.696589e-04 NA NA
Helpful.Community -5.733979e-04 NA NA
Community.Shares.Values 6.702416e-04 NA NA
Get.Along 2.021478e-03 NA NA
Community.Trust -9.908891e-06 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Physical Checkup
ps(`Physical Check-up`)# A tibble: 3 × 3
`Physical Check-up` n pct
<fct> <int> <dbl>
1 0 833 31.9
2 Yes 1740 66.7
3 <NA> 36 1.38
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Physical Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Physical.Check.up ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(imb) Sample size: 2178
Frequency of class labels: 704, 1474
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 425.687
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1376
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 2.0938
(OOB) Brier score: 0.19839145
(OOB) Normalized Brier score: 0.7935658
(OOB) AUC: 0.68816397
(OOB) Log-loss: 0.58312274
(OOB) PR-AUC: 0.50025633
(OOB) G-mean: 0.6427269
(OOB) Requested performance error: 0.3572731
Confusion matrix:
predicted
observed 0 Yes class.error
0 495 209 0.2969
Yes 608 866 0.4125
(OOB) Misclassification rate: 0.3751148
plot(imb,plots.one.page = F)

all 0 Yes
Age 0.0529 NA NA
Income_median 0.0250 NA NA
Employment 0.0139 NA NA
Ethnicity 0.0136 NA NA
EnglishSpeak 0.0079 NA NA
Gender 0.0063 NA NA
Religion 0.0056 NA NA
EnglishDiff 0.0053 NA NA
Helpful.Community 0.0030 NA NA
Expression 0.0029 NA NA
Close.knit.Community 0.0026 NA NA
Feel.Close 0.0025 NA NA
Religious.Importance 0.0024 NA NA
Togetherness 0.0008 NA NA
See.Friends 0.0005 NA NA
Close.Family 0.0000 NA NA
See.Family -0.0002 NA NA
Loyalty -0.0006 NA NA
Helpful.Family -0.0006 NA NA
Get.Along -0.0014 NA NA
Religious.Attendance -0.0016 NA NA
Community.Trust -0.0018 NA NA
Similar.Values -0.0019 NA NA
Helpful.Friends -0.0021 NA NA
Family.Pride -0.0021 NA NA
Family.Respect -0.0023 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1474.0000000 704.0000000 2.0937500 0.3232323 0.7031250 0.5875170
prec npv misclass brier brier.norm auc
0.4487761 0.8055814 0.3751148 0.1983915 0.7935658 0.6881640
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.5831227 0.5478694 0.6066191 0.3232323 0.5002563 0.5952981
F1modgmean gmean
0.6246730 0.6427269
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Physical.Check.up,p=0.75,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Physical.Check.up~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Physical.Check.up~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1634
Frequency of class labels: 803, 831
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 269.5033
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1033
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0349
(OOB) Brier score: 0.13265442
(OOB) Normalized Brier score: 0.53061769
(OOB) AUC: 0.93201562
(OOB) Log-loss: 0.43239367
(OOB) PR-AUC: 0.92794805
(OOB) G-mean: 0.84812364
(OOB) Requested performance error: 0.15187636
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 677 126 0.1569
0 122 709 0.1468
(OOB) Misclassification rate: 0.1517748
print(rfobj) Sample size: 1634
Frequency of class labels: 803, 831
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 269.5033
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1033
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0349
(OOB) Brier score: 0.13265442
(OOB) Normalized Brier score: 0.53061769
(OOB) AUC: 0.93201562
(OOB) Log-loss: 0.43239367
(OOB) PR-AUC: 0.92794805
(OOB) G-mean: 0.84812364
(OOB) Requested performance error: 0.15187636
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 677 126 0.1569
0 122 709 0.1468
(OOB) Misclassification rate: 0.1517748
plot(rfobj,plots.one.page = FALSE)

all Yes 0
EnglishDiff 0.0384 NA NA
Ethnicity 0.0249 NA NA
Income_median 0.0191 NA NA
Close.knit.Community 0.0151 NA NA
Age 0.0141 NA NA
Religious.Attendance 0.0139 NA NA
Gender 0.0128 NA NA
Employment 0.0120 NA NA
Similar.Values 0.0109 NA NA
Community.Trust 0.0102 NA NA
Get.Along 0.0096 NA NA
Successful.Family 0.0090 NA NA
Community.Shares.Values 0.0090 NA NA
Religion 0.0090 NA NA
Religious.Importance 0.0078 NA NA
Expression 0.0072 NA NA
EnglishSpeak 0.0072 NA NA
Trust 0.0066 NA NA
Togetherness 0.0060 NA NA
Helpful.Community 0.0060 NA NA
Helpful.Friends 0.0055 NA NA
Feel.Close 0.0047 NA NA
Helpful.Family 0.0042 NA NA
Family.Respect 0.0041 NA NA
See.Family 0.0036 NA NA
Loyalty 0.0030 NA NA
rfobj$importance all Yes 0
Ethnicity 0.0248968775 NA NA
Age 0.0140963705 NA NA
Gender 0.0128162244 NA NA
Religion 0.0089571442 NA NA
Employment 0.0120478229 NA NA
Income_median 0.0190921015 NA NA
EnglishSpeak 0.0071910506 NA NA
EnglishDiff 0.0384020696 NA NA
See.Family 0.0036459869 NA NA
Close.Family 0.0023719132 NA NA
Helpful.Family 0.0041971735 NA NA
See.Friends 0.0005718011 NA NA
Close.Friends 0.0011714134 NA NA
Helpful.Friends 0.0055246937 NA NA
Family.Respect 0.0041385762 NA NA
Similar.Values 0.0109202600 NA NA
Successful.Family 0.0090357236 NA NA
Trust 0.0066049572 NA NA
Loyalty 0.0030206794 NA NA
Family.Pride -0.0001192607 NA NA
Expression 0.0072266925 NA NA
Spend.Time.Together 0.0010864867 NA NA
Feel.Close 0.0047425012 NA NA
Togetherness 0.0060023659 NA NA
Religious.Attendance 0.0139328764 NA NA
Religious.Importance 0.0077953860 NA NA
Close.knit.Community 0.0150870035 NA NA
Helpful.Community 0.0059667757 NA NA
Community.Shares.Values 0.0089607159 NA NA
Get.Along 0.0095895593 NA NA
Community.Trust 0.0101974130 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
368.0000000 176.0000000 2.0909091 0.3235294 0.9147727 0.2472826
prec npv misclass brier brier.norm auc
0.3675799 0.8584906 0.5367647 0.2206503 0.8826012 0.6578249
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.6316164 0.5244300 0.4433383 0.3235294 0.4660592 0.5000213
F1modgmean gmean
0.4594755 0.4756126
test_rf$importance all Yes 0
Ethnicity 0.0114439693 NA NA
Age 0.0219713731 NA NA
Gender 0.0016219260 NA NA
Religion 0.0027888994 NA NA
Employment 0.0062918989 NA NA
Income_median 0.0229205076 NA NA
EnglishSpeak 0.0033919262 NA NA
EnglishDiff -0.0003063299 NA NA
See.Family -0.0002408119 NA NA
Close.Family -0.0007071544 NA NA
Helpful.Family 0.0003203322 NA NA
See.Friends 0.0011020651 NA NA
Close.Friends -0.0033020947 NA NA
Helpful.Friends -0.0002238848 NA NA
Family.Respect 0.0012973660 NA NA
Similar.Values 0.0001397280 NA NA
Successful.Family 0.0029397819 NA NA
Trust 0.0006694806 NA NA
Loyalty -0.0003052977 NA NA
Family.Pride 0.0049864195 NA NA
Expression 0.0030384659 NA NA
Spend.Time.Together 0.0020002278 NA NA
Feel.Close 0.0027576521 NA NA
Togetherness 0.0018777620 NA NA
Religious.Attendance -0.0019438390 NA NA
Religious.Importance 0.0035575453 NA NA
Close.knit.Community 0.0048280191 NA NA
Helpful.Community 0.0030374539 NA NA
Community.Shares.Values 0.0058025090 NA NA
Get.Along 0.0021470728 NA NA
Community.Trust 0.0021712026 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Dental Checkup
ps(`Dentist Check-up`)# A tibble: 3 × 3
`Dentist Check-up` n pct
<fct> <int> <dbl>
1 0 1100 42.2
2 Yes 1462 56.0
3 <NA> 47 1.80
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Dentist Check-up`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Dentist.Check.up ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(imb) Sample size: 2175
Frequency of class labels: 896, 1279
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 450.5877
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1375
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.4275
(OOB) Brier score: 0.21235502
(OOB) Normalized Brier score: 0.8494201
(OOB) AUC: 0.70813816
(OOB) Log-loss: 0.61336314
(OOB) PR-AUC: 0.60231866
(OOB) G-mean: 0.64945053
(OOB) Requested performance error: 0.35054947
Confusion matrix:
predicted
observed 0 Yes class.error
0 608 288 0.3214
Yes 484 795 0.3784
(OOB) Misclassification rate: 0.3549425
plot(imb,plots.one.page = F)

all 0 Yes
EnglishSpeak 0.0180 NA NA
Close.Family 0.0057 NA NA
Feel.Close 0.0050 NA NA
Helpful.Family 0.0049 NA NA
Family.Pride 0.0040 NA NA
Employment 0.0036 NA NA
Gender 0.0034 NA NA
Successful.Family 0.0021 NA NA
Expression 0.0016 NA NA
Close.Friends 0.0005 NA NA
See.Friends 0.0005 NA NA
Age 0.0002 NA NA
Ethnicity -0.0013 NA NA
Close.knit.Community -0.0016 NA NA
Community.Shares.Values -0.0018 NA NA
Similar.Values -0.0019 NA NA
Community.Trust -0.0022 NA NA
Spend.Time.Together -0.0025 NA NA
Togetherness -0.0027 NA NA
Religious.Importance -0.0033 NA NA
EnglishDiff -0.0034 NA NA
Helpful.Friends -0.0038 NA NA
Loyalty -0.0039 NA NA
Trust -0.0043 NA NA
Income_median -0.0043 NA NA
Get.Along -0.0044 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1279.0000000 896.0000000 1.4274554 0.4119540 0.6785714 0.6215794
prec npv misclass brier brier.norm auc
0.5567766 0.7340720 0.3549425 0.2123550 0.8494201 0.7081382
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.6133631 0.6116700 0.6409429 0.4119540 0.6023187 0.6305603
F1modgmean gmean
0.6451967 0.6494505
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Dentist.Check.up,p=0.75,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Dentist.Check.up~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Dentist.Check.up~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1632
Frequency of class labels: 802, 830
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 268.4903
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1031
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0349
(OOB) Brier score: 0.13575662
(OOB) Normalized Brier score: 0.54302647
(OOB) AUC: 0.91982243
(OOB) Log-loss: 0.44019353
(OOB) PR-AUC: 0.91304878
(OOB) G-mean: 0.84832876
(OOB) Requested performance error: 0.15167124
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 670 132 0.1646
0 115 715 0.1386
(OOB) Misclassification rate: 0.151348
print(rfobj) Sample size: 1632
Frequency of class labels: 802, 830
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 268.4903
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1031
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0349
(OOB) Brier score: 0.13575662
(OOB) Normalized Brier score: 0.54302647
(OOB) AUC: 0.91982243
(OOB) Log-loss: 0.44019353
(OOB) PR-AUC: 0.91304878
(OOB) G-mean: 0.84832876
(OOB) Requested performance error: 0.15167124
Confusion matrix:
predicted
observed Yes 0 class.error
Yes 670 132 0.1646
0 115 715 0.1386
(OOB) Misclassification rate: 0.151348
plot(rfobj,plots.one.page = FALSE)

all Yes 0
Ethnicity 0.0327 NA NA
Income_median 0.0303 NA NA
Religion 0.0265 NA NA
EnglishSpeak 0.0229 NA NA
Gender 0.0158 NA NA
EnglishDiff 0.0156 NA NA
Community.Shares.Values 0.0104 NA NA
Close.Friends 0.0091 NA NA
Get.Along 0.0090 NA NA
Close.knit.Community 0.0080 NA NA
Community.Trust 0.0078 NA NA
Religious.Attendance 0.0077 NA NA
Successful.Family 0.0077 NA NA
Employment 0.0076 NA NA
Religious.Importance 0.0074 NA NA
Helpful.Family 0.0066 NA NA
Age 0.0061 NA NA
Close.Family 0.0049 NA NA
Helpful.Community 0.0042 NA NA
See.Friends 0.0042 NA NA
Similar.Values 0.0042 NA NA
Family.Pride 0.0036 NA NA
Feel.Close 0.0029 NA NA
Spend.Time.Together 0.0023 NA NA
Expression 0.0023 NA NA
Family.Respect 0.0023 NA NA
rfobj$importance all Yes 0
Ethnicity 0.032663624 NA NA
Age 0.006131614 NA NA
Gender 0.015828603 NA NA
Religion 0.026518680 NA NA
Employment 0.007647381 NA NA
Income_median 0.030294863 NA NA
EnglishSpeak 0.022921737 NA NA
EnglishDiff 0.015550752 NA NA
See.Family -0.005575811 NA NA
Close.Family 0.004905287 NA NA
Helpful.Family 0.006649953 NA NA
See.Friends 0.004197575 NA NA
Close.Friends 0.009102336 NA NA
Helpful.Friends 0.001150065 NA NA
Family.Respect 0.002307021 NA NA
Similar.Values 0.004162872 NA NA
Successful.Family 0.007747457 NA NA
Trust 0.001114601 NA NA
Loyalty 0.001745203 NA NA
Family.Pride 0.003639835 NA NA
Expression 0.002307021 NA NA
Spend.Time.Together 0.002340760 NA NA
Feel.Close 0.002903862 NA NA
Togetherness 0.001080911 NA NA
Religious.Attendance 0.007747457 NA NA
Religious.Importance 0.007357942 NA NA
Close.knit.Community 0.007991442 NA NA
Helpful.Community 0.004234059 NA NA
Community.Shares.Values 0.010365281 NA NA
Get.Along 0.009034317 NA NA
Community.Trust 0.007776946 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
319.0000000 224.0000000 1.4241071 0.4125230 0.7767857 0.4514107
prec npv misclass brier brier.norm auc
0.4985673 0.7422680 0.4143646 0.2330666 0.9322663 0.6526009
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.6598971 0.6073298 0.5834643 0.4125230 0.5200296 0.5997432
F1modgmean gmean
0.5878104 0.5921565
test_rf$importance all Yes 0
Ethnicity 0.0190548830 NA NA
Age -0.0008363182 NA NA
Gender 0.0021456891 NA NA
Religion 0.0149186629 NA NA
Employment 0.0061462960 NA NA
Income_median 0.0291057705 NA NA
EnglishSpeak 0.0138818279 NA NA
EnglishDiff 0.0093890066 NA NA
See.Family 0.0009114775 NA NA
Close.Family -0.0012305512 NA NA
Helpful.Family -0.0012548631 NA NA
See.Friends -0.0018495515 NA NA
Close.Friends -0.0025569592 NA NA
Helpful.Friends -0.0002722153 NA NA
Family.Respect 0.0044396784 NA NA
Similar.Values -0.0003858598 NA NA
Successful.Family 0.0025583855 NA NA
Trust 0.0011096618 NA NA
Loyalty 0.0014748035 NA NA
Family.Pride 0.0009339725 NA NA
Expression 0.0006243612 NA NA
Spend.Time.Together 0.0001531622 NA NA
Feel.Close 0.0021725962 NA NA
Togetherness 0.0006275521 NA NA
Religious.Attendance 0.0028457122 NA NA
Religious.Importance -0.0003142404 NA NA
Close.knit.Community -0.0020620379 NA NA
Helpful.Community -0.0004374439 NA NA
Community.Shares.Values 0.0004516296 NA NA
Get.Along -0.0002527406 NA NA
Community.Trust 0.0018926823 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Urgent Care
ps(`Urgentcare`)# A tibble: 3 × 3
Urgentcare n pct
<fct> <int> <dbl>
1 0 2112 81.0
2 Yes 440 16.9
3 <NA> 57 2.18
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Urgentcare`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(`Urgentcare` ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(imb) Sample size: 2167
Frequency of class labels: 1808, 359
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 325.9177
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1370
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 5.0362
(OOB) Brier score: 0.13716597
(OOB) Normalized Brier score: 0.5486639
(OOB) AUC: 0.59096526
(OOB) Log-loss: 0.44659192
(OOB) PR-AUC: 0.23502521
(OOB) G-mean: 0.55299713
(OOB) Requested performance error: 0.44700287
Confusion matrix:
predicted
observed 0 Yes class.error
0 863 945 0.5227
Yes 129 230 0.3593
(OOB) Misclassification rate: 0.4956161
plot(imb,plots.one.page = F)

all 0 Yes
Close.Family 0.0224 NA NA
Ethnicity 0.0184 NA NA
Income_median 0.0175 NA NA
Helpful.Family 0.0138 NA NA
Religion 0.0115 NA NA
Helpful.Friends 0.0108 NA NA
Helpful.Community 0.0095 NA NA
Close.Friends 0.0094 NA NA
Age 0.0091 NA NA
Get.Along 0.0082 NA NA
Loyalty 0.0082 NA NA
Community.Trust 0.0080 NA NA
Family.Pride 0.0075 NA NA
Religious.Importance 0.0072 NA NA
Trust 0.0070 NA NA
See.Friends 0.0054 NA NA
Successful.Family 0.0053 NA NA
Similar.Values 0.0047 NA NA
See.Family 0.0035 NA NA
Family.Respect 0.0032 NA NA
Spend.Time.Together 0.0030 NA NA
Feel.Close 0.0029 NA NA
EnglishDiff 0.0028 NA NA
EnglishSpeak 0.0010 NA NA
Employment -0.0006 NA NA
Religious.Attendance -0.0010 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1808.0000000 359.0000000 5.0362117 0.1656668 0.6406685 0.4773230
prec npv misclass brier brier.norm auc
0.1957447 0.8699597 0.4956161 0.1371660 0.5486639 0.5909653
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.4465919 0.2998696 0.4034674 0.1656668 0.2350252 0.4264334
F1modgmean gmean
0.4782322 0.5529971
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Urgentcare,p=0.75,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Urgentcare~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(Urgentcare~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1626
Frequency of class labels: 800, 826
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 257.0357
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1028
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0325
(OOB) Brier score: 0.10820287
(OOB) Normalized Brier score: 0.43281149
(OOB) AUC: 0.97188105
(OOB) Log-loss: 0.37643219
(OOB) PR-AUC: 0.96438371
(OOB) G-mean: 0.91746819
(OOB) Requested performance error: 0.08253181
Confusion matrix:
predicted
observed 0 Yes class.error
0 729 71 0.0887
Yes 63 763 0.0763
(OOB) Misclassification rate: 0.08241082
print(rfobj) Sample size: 1626
Frequency of class labels: 800, 826
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 257.0357
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1028
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0325
(OOB) Brier score: 0.10820287
(OOB) Normalized Brier score: 0.43281149
(OOB) AUC: 0.97188105
(OOB) Log-loss: 0.37643219
(OOB) PR-AUC: 0.96438371
(OOB) G-mean: 0.91746819
(OOB) Requested performance error: 0.08253181
Confusion matrix:
predicted
observed 0 Yes class.error
0 729 71 0.0887
Yes 63 763 0.0763
(OOB) Misclassification rate: 0.08241082
plot(rfobj,plots.one.page = FALSE)

all 0 Yes
Ethnicity 0.0170 NA NA
Religion 0.0170 NA NA
Religious.Attendance 0.0158 NA NA
EnglishDiff 0.0127 NA NA
Get.Along 0.0103 NA NA
Helpful.Family 0.0092 NA NA
Community.Trust 0.0085 NA NA
Community.Shares.Values 0.0084 NA NA
EnglishSpeak 0.0079 NA NA
Income_median 0.0071 NA NA
Religious.Importance 0.0066 NA NA
Helpful.Community 0.0066 NA NA
Employment 0.0060 NA NA
Close.Friends 0.0037 NA NA
Close.knit.Community 0.0035 NA NA
See.Friends 0.0030 NA NA
Feel.Close 0.0030 NA NA
Loyalty 0.0029 NA NA
Successful.Family 0.0029 NA NA
Family.Respect 0.0023 NA NA
Togetherness 0.0017 NA NA
Close.Family 0.0012 NA NA
Similar.Values 0.0011 NA NA
Trust 0.0011 NA NA
Spend.Time.Together 0.0010 NA NA
Helpful.Friends 0.0006 NA NA
rfobj$importance all 0 Yes
Ethnicity 1.699750e-02 NA NA
Age -7.669618e-05 NA NA
Gender -7.032206e-04 NA NA
Religion 1.698406e-02 NA NA
Employment 5.953208e-03 NA NA
Income_median 7.143534e-03 NA NA
EnglishSpeak 7.887759e-03 NA NA
EnglishDiff 1.267868e-02 NA NA
See.Family -9.896162e-05 NA NA
Close.Family 1.203237e-03 NA NA
Helpful.Family 9.163088e-03 NA NA
See.Friends 3.035886e-03 NA NA
Close.Friends 3.664146e-03 NA NA
Helpful.Friends 5.750132e-04 NA NA
Family.Respect 2.341081e-03 NA NA
Similar.Values 1.110752e-03 NA NA
Successful.Family 2.928322e-03 NA NA
Trust 1.074421e-03 NA NA
Loyalty 2.946524e-03 NA NA
Family.Pride 4.858915e-04 NA NA
Expression -2.048803e-03 NA NA
Spend.Time.Together 1.044696e-03 NA NA
Feel.Close 2.966382e-03 NA NA
Togetherness 1.681505e-03 NA NA
Religious.Attendance 1.575473e-02 NA NA
Religious.Importance 6.574343e-03 NA NA
Close.knit.Community 3.505178e-03 NA NA
Helpful.Community 6.562713e-03 NA NA
Community.Shares.Values 8.402837e-03 NA NA
Get.Along 1.026504e-02 NA NA
Community.Trust 8.536022e-03 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance=T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
452.00000000 89.00000000 5.07865169 0.16451017 0.97752809 0.01327434
prec npv misclass brier brier.norm auc
0.16322702 0.75000000 0.82809612 0.19174317 0.76697270 0.62026449
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.57101833 0.27974277 0.04772353 0.16451017 0.24471152 0.19682759
F1modgmean gmean
0.08081797 0.11391241
test_rf$importance all 0 Yes
Ethnicity 0.0092679961 NA NA
Age -0.0010239624 NA NA
Gender -0.0003857482 NA NA
Religion 0.0039876778 NA NA
Employment 0.0025635146 NA NA
Income_median 0.0116989391 NA NA
EnglishSpeak 0.0252844657 NA NA
EnglishDiff 0.0032710913 NA NA
See.Family 0.0009597281 NA NA
Close.Family 0.0053102624 NA NA
Helpful.Family -0.0020859738 NA NA
See.Friends -0.0006427318 NA NA
Close.Friends 0.0028938441 NA NA
Helpful.Friends -0.0042392218 NA NA
Family.Respect 0.0014365929 NA NA
Similar.Values 0.0030089055 NA NA
Successful.Family 0.0049482951 NA NA
Trust 0.0085479527 NA NA
Loyalty 0.0054304756 NA NA
Family.Pride 0.0034385977 NA NA
Expression 0.0091406593 NA NA
Spend.Time.Together 0.0116566550 NA NA
Feel.Close 0.0023862014 NA NA
Togetherness 0.0139434459 NA NA
Religious.Attendance 0.0062598150 NA NA
Religious.Importance 0.0035382193 NA NA
Close.knit.Community 0.0044157273 NA NA
Helpful.Community 0.0143630787 NA NA
Community.Shares.Values 0.0001754873 NA NA
Get.Along -0.0063640012 NA NA
Community.Trust -0.0015271133 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot
Folk Medicine
ps(`Folkmedicine`)# A tibble: 3 × 3
Folkmedicine n pct
<fct> <int> <dbl>
1 0 2189 83.9
2 Yes 348 13.3
3 <NA> 72 2.76
Random Forest (randomForestSRC)
#install.packages("randomForestSRC)
rfdata <- qol |>
select(`Folkmedicine`, Ethnicity, Age, Gender,Religion, `Full Time Employment`, Income_median, `English Speaking`, `English Difficulties`,`See Family`:`Community Trust`) %>%
na.omit() |>
rename(Employment=`Full Time Employment`,
EnglishSpeak=`English Speaking`,
EnglishDiff=`English Difficulties`) |>
as.data.frame() |>
rename_with(make.names)
imb <- imbalanced(Folkmedicine ~ .,importance=T,data=rfdata,
perf.type = "gmean",splitrule="gini")
print(imb) Sample size: 2152
Frequency of class labels: 1866, 286
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 271.2393
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1360
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 6.5245
(OOB) Brier score: 0.11194129
(OOB) Normalized Brier score: 0.44776514
(OOB) AUC: 0.66867163
(OOB) Log-loss: 0.37567305
(OOB) PR-AUC: 0.21700153
(OOB) G-mean: 0.62960131
(OOB) Requested performance error: 0.37039869
Confusion matrix:
predicted
observed 0 Yes class.error
0 1037 829 0.4443
Yes 82 204 0.2867
(OOB) Misclassification rate: 0.4233271
plot(imb,plots.one.page = F)

all 0 Yes
Age 0.0537 NA NA
Ethnicity 0.0494 NA NA
EnglishSpeak 0.0124 NA NA
Trust 0.0118 NA NA
Religion 0.0113 NA NA
Community.Trust 0.0110 NA NA
EnglishDiff 0.0108 NA NA
Feel.Close 0.0102 NA NA
Spend.Time.Together 0.0096 NA NA
Similar.Values 0.0092 NA NA
Loyalty 0.0086 NA NA
Community.Shares.Values 0.0083 NA NA
Family.Respect 0.0068 NA NA
Religious.Importance 0.0065 NA NA
Successful.Family 0.0062 NA NA
Helpful.Community 0.0061 NA NA
Helpful.Friends 0.0059 NA NA
Family.Pride 0.0048 NA NA
Income_median 0.0040 NA NA
Employment 0.0040 NA NA
Expression 0.0028 NA NA
Gender 0.0018 NA NA
Togetherness 0.0009 NA NA
See.Family 0.0001 NA NA
Close.knit.Community 0.0000 NA NA
See.Friends -0.0004 NA NA
get.imbalanced.performance(imb) n.majority n.minority iratio threshold sens spec
1866.0000000 286.0000000 6.5244755 0.1328996 0.7132867 0.5557342
prec npv misclass brier brier.norm auc
0.1974831 0.9267203 0.4233271 0.1119413 0.4477651 0.6686716
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.3756730 0.3093252 0.4280739 0.1328996 0.2170015 0.4694633
F1modgmean gmean
0.5288376 0.6296013
var_importance <- imb$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
# Create ggplot for variable importance
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_minimal()
plot(importance_plot)
Training/Test set Variable Importance
Training Importance
set.seed(222)
imbal_index <- createDataPartition(rfdata$Folkmedicine,p=0.75,list=F,times=1)
imbal_train <- rfdata[imbal_index,]
train <- ROSE::ROSE(Folkmedicine~.,
data=imbal_train,
seed=3)$data
test<- rfdata[-imbal_index,]
# rfsrc(Family~.,data=rfdata, importance="permute", perf.type="gmean",block.size = 10) ->rfobj
rfobj <- imbalanced(`Folkmedicine` ~ .,importance=T,data=train,
perf.type = "gmean",splitrule="gini")
print(rfobj) Sample size: 1615
Frequency of class labels: 793, 822
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 231.3443
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1021
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0366
(OOB) Brier score: 0.0867378
(OOB) Normalized Brier score: 0.34695121
(OOB) AUC: 0.98107682
(OOB) Log-loss: 0.31961798
(OOB) PR-AUC: 0.97903926
(OOB) G-mean: 0.92703296
(OOB) Requested performance error: 0.07296704
Confusion matrix:
predicted
observed 0 Yes class.error
0 710 83 0.1047
Yes 33 789 0.0401
(OOB) Misclassification rate: 0.07182663
print(rfobj) Sample size: 1615
Frequency of class labels: 793, 822
Number of trees: 3000
Forest terminal node size: 1
Average no. of terminal nodes: 231.3443
No. of variables tried at each split: 6
Total no. of variables: 31
Resampling used to grow trees: swor
Resample size used to grow trees: 1021
Analysis: RFQ
Family: class
Splitting rule: gini *random*
Number of random split points: 10
Imbalanced ratio: 1.0366
(OOB) Brier score: 0.0867378
(OOB) Normalized Brier score: 0.34695121
(OOB) AUC: 0.98107682
(OOB) Log-loss: 0.31961798
(OOB) PR-AUC: 0.97903926
(OOB) G-mean: 0.92703296
(OOB) Requested performance error: 0.07296704
Confusion matrix:
predicted
observed 0 Yes class.error
0 710 83 0.1047
Yes 33 789 0.0401
(OOB) Misclassification rate: 0.07182663
plot(rfobj,plots.one.page = FALSE)

all 0 Yes
EnglishSpeak 0.0117 NA NA
Religious.Importance 0.0106 NA NA
Community.Trust 0.0089 NA NA
Religious.Attendance 0.0088 NA NA
Religion 0.0075 NA NA
Age 0.0071 NA NA
Ethnicity 0.0068 NA NA
Get.Along 0.0057 NA NA
Close.Family 0.0055 NA NA
EnglishDiff 0.0044 NA NA
Family.Pride 0.0037 NA NA
See.Friends 0.0035 NA NA
Income_median 0.0029 NA NA
Employment 0.0020 NA NA
Gender 0.0018 NA NA
Feel.Close 0.0017 NA NA
Spend.Time.Together 0.0011 NA NA
Successful.Family 0.0010 NA NA
Close.knit.Community 0.0009 NA NA
Helpful.Community 0.0008 NA NA
Trust 0.0007 NA NA
Loyalty 0.0006 NA NA
Family.Respect 0.0005 NA NA
Helpful.Family 0.0005 NA NA
See.Family 0.0004 NA NA
Helpful.Friends -0.0003 NA NA
rfobj$importance all 0 Yes
Ethnicity 0.0067640720 NA NA
Age 0.0070766909 NA NA
Gender 0.0017640986 NA NA
Religion 0.0075320542 NA NA
Employment 0.0020095165 NA NA
Income_median 0.0028814427 NA NA
EnglishSpeak 0.0116840910 NA NA
EnglishDiff 0.0043874728 NA NA
See.Family 0.0004013891 NA NA
Close.Family 0.0054846916 NA NA
Helpful.Family 0.0004618194 NA NA
See.Friends 0.0034717663 NA NA
Close.Friends -0.0014286934 NA NA
Helpful.Friends -0.0003060976 NA NA
Family.Respect 0.0004618194 NA NA
Similar.Values -0.0015459999 NA NA
Successful.Family 0.0009917884 NA NA
Trust 0.0007201414 NA NA
Loyalty 0.0005876596 NA NA
Family.Pride 0.0037209612 NA NA
Expression -0.0013675560 NA NA
Spend.Time.Together 0.0011127301 NA NA
Feel.Close 0.0017019253 NA NA
Togetherness -0.0014286934 NA NA
Religious.Attendance 0.0087508013 NA NA
Religious.Importance 0.0105853089 NA NA
Close.knit.Community 0.0008774882 NA NA
Helpful.Community 0.0008228279 NA NA
Community.Shares.Values -0.0004698596 NA NA
Get.Along 0.0056769849 NA NA
Community.Trust 0.0089086889 NA NA
var_importance <- rfobj$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance", x = "Variable", y = "Importance") +
theme_bw()
plot(importance_plot)
Test Set Importance
test_rf <- predict.rfsrc(rfobj,newdata=test,importance = T)
get.imbalanced.performance(test_rf) n.majority n.minority iratio threshold sens spec
466.00000000 71.00000000 6.56338028 0.13221601 1.00000000 0.03218884
prec npv misclass brier brier.norm auc
0.13601533 1.00000000 0.83985102 0.17768100 0.71072400 0.63974944
logloss F1 F1mod pr.auc.rand pr.auc F1gmean
0.53556189 0.23946037 0.09896390 0.13221601 0.20345993 0.20943643
F1modgmean gmean
0.13918819 0.17941249
test_rf$importance all 0 Yes
Ethnicity 4.873851e-02 NA NA
Age 3.857721e-02 NA NA
Gender -3.894706e-05 NA NA
Religion 8.982006e-03 NA NA
Employment 7.597281e-03 NA NA
Income_median 6.111832e-03 NA NA
EnglishSpeak 1.610365e-02 NA NA
EnglishDiff 1.057510e-02 NA NA
See.Family -1.988875e-03 NA NA
Close.Family -4.847550e-03 NA NA
Helpful.Family -1.618896e-03 NA NA
See.Friends -7.924134e-03 NA NA
Close.Friends -3.134457e-03 NA NA
Helpful.Friends -1.694438e-03 NA NA
Family.Respect 5.171665e-03 NA NA
Similar.Values 1.137199e-02 NA NA
Successful.Family 6.723931e-04 NA NA
Trust -2.195275e-03 NA NA
Loyalty 2.697698e-03 NA NA
Family.Pride 3.843059e-04 NA NA
Expression -2.271595e-05 NA NA
Spend.Time.Together 3.687656e-03 NA NA
Feel.Close 1.138885e-03 NA NA
Togetherness 3.526200e-03 NA NA
Religious.Attendance -6.751275e-03 NA NA
Religious.Importance 3.075361e-03 NA NA
Close.knit.Community -5.567672e-03 NA NA
Helpful.Community 4.667771e-03 NA NA
Community.Shares.Values 7.458798e-03 NA NA
Get.Along -6.150190e-04 NA NA
Community.Trust 2.991126e-03 NA NA
var_importance <- test_rf$importance[, "all"]
var_importance_df <- data.frame(variable = names(var_importance), importance = var_importance)
importance_plot <- ggplot(var_importance_df, aes(x = reorder(variable, importance), y = importance)) +
geom_bar(stat = "identity", fill = "#F8766D") +
coord_flip() +
labs(title = "Variable Importance (Test)", x = "Variable", y = "Importance") +
theme_bw()
importance_plot